Nebula 1

home *** CD-ROM | disk | FTP | other *** search

/ Nebula 1 / Nebula One.iso / Internet / WWW / swish.11 / src / merge.c < prev next >

Wrap

C/C++ Source or Header | 1995-03-13 | 13KB | 638 lines

/*
** Copyright (C) 1995, Enterprise Integration Technologies Corp.
** All Rights Reserved.
** Kevin Hughes, kevinh@eit.com
** 3/11/94
*/

#include "swish.h"
#include "merge.h"

/* The main merge function - it accepts three file names.
** This is a bit hairy. It basically acts as a zipper,
** zipping up both index files into one.
**
** file1, file2 - existing index files to read.
** outfile      - merged index file to create.
**
** Word entries of both inputs are walked in parallel and written in
** wordcompare() order; a word found in both files gets one combined
** entry. When one input runs out, the entries still left in the other
** one are copied as well, so no words are dropped.
**
** The output stream is opened once with "w" and kept open until the
** end. It must not be reopened in append mode: the offset table in the
** header is patched in place as the last step, and append mode forces
** every write to the end of the file no matter where fseek() points.
**
** NOTE(review): the code relies on progerr() not returning (the streams
** are used unconditionally after the error checks) - confirm.
*/

void readmerge(file1, file2, outfile)
     char *file1;
     char *file2;
     char *outfile;
{
    int i, j, indexfilenum1, indexfilenum2, result, totalfiles,
        skipwords, skipfiles, done1, done2;
    long limit1, limit2, fileinfo1, fileinfo2, offsetstart;
    char line[MAXSTRLEN];
    char *info;
    struct indexentry *ip1, *ip2, *ip3;
    FILE *fp1, *fp2, *fp3;

    initindexfilehashlist();

    if ((fp1 = fopen(file1, "r")) == NULL) {
        sprintf(errorstr, "Couldn't read the index file \"%s\".", file1);
        progerr(errorstr);
    }
    if (!isokindexheader(fp1)) {
        sprintf(errorstr, "\"%s\" has an unknown format.", file1);
        progerr(errorstr);
    }
    if ((fp2 = fopen(file2, "r")) == NULL) {
        sprintf(errorstr, "Couldn't read the index file \"%s\".", file2);
        progerr(errorstr);
    }
    if (!isokindexheader(fp2)) {
        sprintf(errorstr, "\"%s\" has an unknown format.", file2);
        progerr(errorstr);
    }

    ip1 = ip2 = ip3 = NULL;

    if (verbose)
        printf("Counting files... ");

    indexfilenum1 = getindexfilenum(fp1);
    indexfilenum2 = getindexfilenum(fp2);
    totalfiles = indexfilenum1 + indexfilenum2;

    if (verbose) {
        printf("%d files.\n", indexfilenum1 + indexfilenum2);
        printf("Reading stopwords...");
    }

    /* Remember where the word section ends and where the file list
    ** starts in each input; the global offsets[] is overwritten by
    ** every readoffsets() call.
    */
    readoffsets(fp1);
    readstopwords(fp1);
    limit1 = offsets[STOPWORDPOS];
    fileinfo1 = offsets[FILELISTPOS];

    readoffsets(fp2);
    readstopwords(fp2);
    limit2 = offsets[STOPWORDPOS];
    fileinfo2 = offsets[FILELISTPOS];

    if (verbose)
        printf("\nReading file info...");

    /* Register every file of both indexes. Files of the second index
    ** are numbered after those of the first one.
    */
    fseek(fp1, fileinfo1, 0);
    for (i = 1; i <= indexfilenum1; i++) {
        if (fgets(line, MAXSTRLEN, fp1) == NULL) {
            sprintf(errorstr, "Couldn't read the index file \"%s\".",
                    file1);
            progerr(errorstr);
        }
        addindexfilelist(i, line, &totalfiles);
    }
    fseek(fp2, fileinfo2, 0);
    for (i = 1; i <= indexfilenum2; i++) {
        if (fgets(line, MAXSTRLEN, fp2) == NULL) {
            sprintf(errorstr, "Couldn't read the index file \"%s\".",
                    file2);
            progerr(errorstr);
        }
        addindexfilelist(i + indexfilenum1, line, &totalfiles);
    }

    if ((fp3 = fopen(outfile, "w")) == NULL) {
        sprintf(errorstr,
                "Couldn't write the merged index file \"%s\".", outfile);
        progerr(errorstr);
    }

    if (verbose)
        printf("\nMerging words... ");

    printheader(fp3, outfile, 0, totalfiles);

    /* Reserve room for the offset table; the real values are written
    ** over this placeholder at the very end.
    */
    offsetstart = ftell(fp3);
    for (i = 0; i < MAXCHARS; i++)
        fprintf(fp3, "%016li", offsets[i]);
    fputc('\n', fp3);

    /* NOTE(review): presumably these calls leave each input positioned
    ** at its first word entry, which readindexline() depends on -
    ** confirm against readoffsets().
    */
    readoffsets(fp1);
    readoffsets(fp2);

    for (i = 0; i < MAXCHARS; i++)
        offsets[i] = 0;

    skipwords = 0;
    done1 = done2 = 0;
    while (1) {
        /* Refill whichever side has no pending entry. */
        if (ip1 == NULL && !done1) {
            ip1 = (struct indexentry *) readindexline(fp1, limit1);
            if (ip1 == NULL)
                done1 = 1;
        }
        if (ip2 == NULL && !done2) {
            ip2 = (struct indexentry *) readindexline(fp2, limit2);
            if (ip2 == NULL)
                done2 = 1;
            else
                addfilenums(ip2, indexfilenum1);
        }

        if (ip1 == NULL && ip2 == NULL)
            break;

        /* An exhausted side always loses the comparison so that the
        ** other side is drained.
        */
        if (ip1 == NULL)
            result = 1;
        else if (ip2 == NULL)
            result = -1;
        else
            result = wordcompare(ip1->word, ip2->word);

        if (!result) {
            ip3 = (struct indexentry *) mergeindexentries(ip1, ip2);
            printindexentry(ip3, fp3);
            freeindexentry(ip1);
            freeindexentry(ip2);
            freeindexentry(ip3);
            ip1 = ip2 = NULL;
            skipwords++;
        }
        else if (result < 0) {
            printindexentry(ip1, fp3);
            freeindexentry(ip1);
            ip1 = NULL;
        }
        else {
            printindexentry(ip2, fp3);
            freeindexentry(ip2);
            ip2 = NULL;
        }
    }

    if (verbose) {
        if (skipwords)
            printf("%d redundant word%s.", skipwords,
                   (skipwords == 1) ? "" : "s");
        else
            printf("no redundant words.");
    }

    printstopwords(fp3);

    if (verbose)
        printf("\nMerging file info... ");

    /* Write the info line of every distinct file, in merged-number
    ** order: file i is the next one to emit when it maps to j.
    */
    offsets[FILELISTPOS] = ftell(fp3);
    for (i = j = 1; i <= indexfilenum1 + indexfilenum2; i++)
        if (getmap(i) == j) {
            addtofilehashlist(j++ - 1, ftell(fp3));
            info = lookupindexfilenum(i);
            if (info != NULL)
                fprintf(fp3, "%s", info);
        }

    skipfiles = (indexfilenum1 + indexfilenum2) - totalfiles;
    if (verbose) {
        if (skipfiles)
            printf("%d redundant file%s.", skipfiles,
                   (skipfiles == 1) ? "" : "s");
        else
            printf("no redundant files.");
    }

    printfileoffsets(fp3);

    /* Go back and overwrite the placeholder offset table. */
    fseek(fp3, offsetstart, 0);
    for (i = 0; i < MAXCHARS; i++)
        fprintf(fp3, "%016li", offsets[i]);

    if (fclose(fp3) == EOF) {
        sprintf(errorstr,
                "Couldn't write the merged index file \"%s\".", outfile);
        progerr(errorstr);
    }
    fclose(fp1);
    fclose(fp2);

    if (verbose)
        printf("\nDone.\n");
}

/* Gets the number of files in an index file.
**
** Counts the lines between offsets[FILELISTPOS] and
** offsets[FILEOFFSETPOS] as loaded by readoffsets(). The stream is left
** wherever counting stopped. Counting also stops at end of file, so a
** truncated index cannot hang the program.
*/

int getindexfilenum(fp)
     FILE *fp;
{
    int count;
    char line[MAXSTRLEN];

    readoffsets(fp);
    fseek(fp, offsets[FILELISTPOS], 0);

    count = 0;
    while (ftell(fp) < offsets[FILEOFFSETPOS]) {
        if (fgets(line, MAXSTRLEN, fp) == NULL)
            break;
        count++;
    }

    return count;
}

/* This adds an offset to the file numbers in a particular
** result list. For instance, file 1 has file numbers going from
** 1 to 10, but so does file 2, so I have to add 10 to all the
** file numbers in file 2 before merging.
**
** Each stored number is decoded, shifted by num, passed through
** getmap() so redundant files collapse onto one number, and encoded
** again.
*/

void addfilenums(ip, num)
     struct indexentry *ip;
     int num;
{
    struct result *rp;

    for (rp = ip->result; rp != NULL; rp = rp->next)
        rp->filenum =
            encodefilenum(getmap(decodefilenum(rp->filenum) + num));
}

/* This reads the next line in the index file and puts the results
** in a result structure.
*/

/* readindexline() details:
**
** fp    - index file, positioned at the start of a word entry.
** limit - file position at which the word section ends.
**
** Returns a newly allocated entry (release it with freeindexentry()),
** or NULL when the stream is at limit or at end of file.
**
** An entry is the word, a ':', a run of compressed numbers and a
** terminating 0 byte. Numbers are taken three at a time as file
** number, rank and structure. Words longer than MAXWORDLEN - 1
** characters are truncated instead of overrunning the buffer, and end
** of file is treated like the end of the entry.
*/

struct indexentry *readindexline(fp, limit)
     FILE *fp;
     long limit;
{
    int i, c, x, countnum, rank, filenum, structure;
    char fileword[MAXWORDLEN];
    struct result *rp;
    struct indexentry *ip;

    rp = NULL;
    rank = filenum = structure = 0;

    if (limit == ftell(fp))
        return NULL;

    /* Collect the word up to the ':' separator. */
    i = 0;
    while ((c = fgetc(fp)) != 0 && c != EOF && c != ':') {
        if (i < MAXWORDLEN - 1)
            fileword[i++] = c;
    }
    fileword[i] = '\0';

    /* Nothing sensible can follow a premature end of file; reporting
    ** "no more entries" also keeps the caller from looping forever.
    */
    if (c == EOF)
        return NULL;

    countnum = 1;

    /* The byte that ended the word is pushed back and immediately
    ** re-read by the loop test below. That test consumes one byte
    ** before every number: the ':' the first time, and afterwards the
    ** byte following the previous number.
    ** NOTE(review): presumably that byte is padding produced by
    ** compress() - check the writer before changing this.
    */
    ungetc(c, fp);

    while ((c = fgetc(fp)) != 0 && c != EOF) {
        /* Numbers are stored base 128, most significant group first;
        ** bit 7 is set on every byte except the last.
        */
        x = 0;
        do {
            c = fgetc(fp);
            if (c == 0 || c == EOF)
                break;
            x *= 128;
            x += c & 127;
        } while (c & 128);
        if (c == 0 || c == EOF)
            break;

        if (x) {
            if (countnum == 1) {
                filenum = x;
                countnum++;
            }
            else if (countnum == 2) {
                rank = x;
                countnum++;
            }
            else if (countnum == 3) {
                structure = x;
                rp = (struct result *)
                    addtoresultlist(rp, filenum, rank, structure);
                countnum = 1;
            }
        }
    }

    ip = (struct indexentry *) emalloc(sizeof(struct indexentry));
    ip->word = (char *) mystrdup(fileword);
    ip->result = rp;

    return ip;
}

/* This puts all the file info into a hash table so that it can
** be looked up by its pathname and filenumber. This is how
** we find redundant file information.
*/ void addindexfilelist(num, info, totalfiles) int num; char *info; int *totalfiles; { int i; static int j; unsigned hashval; char tmpstr[MAXSTRLEN], path[MAXSTRLEN]; struct indexfileinfo *ip1, *ip2; strcpy(path, extractpath(info)); i = lookupindexfilepath(path); if (i != -1) { *totalfiles = *totalfiles - 1; remap(num, i); return; } remap(num, j + 1); j++; ip1 = (struct indexfileinfo *) emalloc(sizeof(struct indexfileinfo)); ip1->filenum = num; ip1->fileinfo = (char *) mystrdup(info); ip1->path = (char *) mystrdup(path); sprintf(tmpstr, "%d", num); hashval = bighash(tmpstr); ip1->next = indexfilehashlist[hashval]; indexfilehashlist[hashval] = ip1; ip2 = (struct indexfileinfo *) emalloc(sizeof(struct indexfileinfo)); ip2->filenum = num; ip2->fileinfo = (char *) mystrdup(info); ip2->path = (char *) mystrdup(path); hashval = bighash(path); ip2->next = indexfilehashlist[hashval]; indexfilehashlist[hashval] = ip2; } /* This extracts the pathname information from the file information ** line as stored in the index file. */ char *extractpath(s) char *s; { int i; static char path[MAXSTRLEN]; for (i = 0; s[i] && s[i] != '\"'; i++) path[i] = s[i]; path[i - 1] = '\0'; path[i] = '\0'; return path; } /* This returns the file information corresponding to a file number. */ char *lookupindexfilenum(num) int num; { unsigned hashval; char tmpstr[MAXSTRLEN]; struct indexfileinfo *ip; sprintf(tmpstr, "%d", num); hashval = bighash(tmpstr); ip = indexfilehashlist[hashval]; while (ip != NULL) { if (ip->filenum == num) return ip->fileinfo; ip = ip->next; } return NULL; } /* This returns the file number corresponding to a pathname. */ int lookupindexfilepath(path) char *path; { unsigned hashval; struct indexfileinfo *ip; hashval = bighash(path); ip = indexfilehashlist[hashval]; while (ip != NULL) { if (!strcmp(ip->path, path)) return ip->filenum; ip = ip->next; } return -1; } /* This simply concatenates two information lists that correspond ** to a word found in both index files. 
*/ struct indexentry *mergeindexentries(ip1, ip2) struct indexentry *ip1; struct indexentry *ip2; { struct result *newrp, *rp1, *rp2; struct indexentry *ep; rp1 = ip1->result; rp2 = ip2->result; newrp = NULL; while (rp1 != NULL) { newrp = (struct result *) addtoresultlist(newrp, rp1->filenum, rp1->rank, rp1->structure); rp1 = rp1->next; } while (rp2 != NULL) { newrp = (struct result *) addtoresultlist(newrp, rp2->filenum, rp2->rank, rp2->structure); rp2 = rp2->next; } ep = (struct indexentry *) emalloc(sizeof(struct indexentry)); ep->word = (char *) mystrdup(ip1->word); ep->result = newrp; return ep; } /* This prints a new word entry into the merged index file, ** removing redundant file information as it goes along. */ void printindexentry(ip, fp) struct indexentry *ip; FILE *fp; { int i, num; struct result *rp; for (i = 0; indexchars[i] != '\0'; i++) if ((ip->word)[0] == indexchars[i] && !offsets[i]) offsets[i] = ftell(fp); fprintf(fp, "%s:", ip->word); initmarkentrylist(); rp = ip->result; while (rp != NULL) { num = rp->filenum; if (!ismarked(num)) { marknum(num); compress(num, fp); compress(rp->rank, fp); compress(rp->structure, fp); } rp = rp->next; } fputc(0, fp); } /* This associates a number with a new number. ** This function is used to remap file numbers from index ** files to a new merged index file. */ void remap(oldnum, newnum) int oldnum; int newnum; { unsigned hashval; char tmpstr[MAXSTRLEN]; struct mapentry *mp; mp = (struct mapentry *) emalloc(sizeof(struct mapentry)); mp->oldnum = oldnum; mp->newnum = newnum; sprintf(tmpstr, "%d", oldnum); hashval = bighash(tmpstr); mp->next = mapentrylist[hashval]; mapentrylist[hashval] = mp; } /* This retrieves the number associated with another. 
*/ int getmap(num) int num; { unsigned hashval; char tmpstr[MAXSTRLEN]; struct mapentry *mp; sprintf(tmpstr, "%d", num); hashval = bighash(tmpstr); mp = mapentrylist[hashval]; while (mp != NULL) { if (mp->oldnum == num) return mp->newnum; mp = mp->next; } return num; } /* This marks a number as having been printed. */ void marknum(num) int num; { unsigned hashval; char tmpstr[MAXSTRLEN]; struct markentry *mp; mp = (struct markentry *) emalloc(sizeof(struct markentry)); mp->num = num; sprintf(tmpstr, "%d", num); hashval = bighash(tmpstr); mp->next = markentrylist[hashval]; markentrylist[hashval] = mp; } /* Has a number been printed? */ int ismarked(num) int num; { unsigned hashval; char tmpstr[MAXSTRLEN]; struct markentry *mp; sprintf(tmpstr, "%d", num); hashval = bighash(tmpstr); mp = markentrylist[hashval]; while (mp != NULL) { if (mp->num == num) return 1; mp = mp->next; } return 0; } /* Initialize the marking list. */ void initmarkentrylist() { int i; struct markentry *mp; for (i = 0; i < BIGHASHSIZE; i++) { mp = markentrylist[i]; if (mp != NULL) free(mp); markentrylist[i] = NULL; } } /* Initialize the main file list. */ void initindexfilehashlist() { int i; struct indexfileinfo *ip; for (i = 0; i < BIGHASHSIZE; i++) { ip = indexfilehashlist[i]; if (ip != NULL) free(ip); indexfilehashlist[i] = NULL; } } /* Frees up used index entries, my best attempt at memory management... ** I still have bytes leaking elsewhere... */ void freeindexentry(ip) struct indexentry *ip; { struct result *rp, *oldp; free(ip->word); rp = ip->result; while (rp != NULL) { oldp = rp; rp = rp->next; free(oldp); } free(ip); } /* Translates a file number into something that can be compressed. */ int encodefilenum(num) int num; { int i, j; for (i = j = 0; i != num; i++) { j++; if (!(j % 128)) j++; } return j; } /* Translates a compressed file number into a correct file number. 
/* decodefilenum() is the inverse of encodefilenum(): it subtracts the
** number of multiples of 128 that lie below num, so 129 gives 128,
** 255 gives 254 and 257 gives 255. The count is computed directly
** rather than by stepping through every smaller number, since this
** runs once per result of every merged word. Numbers of 1 or less are
** returned unchanged.
*/

int decodefilenum(num)
     int num;
{
    if (num <= 1)
        return num;

    return num - (num - 1) / 128;
}